Testing librosa tempo detection
import numpy as np
from matplotlib import pyplot as plt
import IPython.display as ipd
import librosa
import librosa.display
import math
import glob
import os
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
Let's load a snippet of a song.
# Each test case names a source file (glob pattern), its ground-truth tempo,
# and the [start, start+len] window (seconds) of audio to analyze.
TEST_CASE_INDEX = 1
test_cases = [
    # these are all pretty good
    {'src': '**/*Ed*', 'known_tempo': 126, 'start': 60.34, 'len': 2.5 },
    {'src': '**/*Ed*', 'known_tempo': 126, 'start': 60.34, 'len': 5 },
    {'src': '**/*Ed*', 'known_tempo': 126, 'start': 60.34, 'len': 10 },
    {'src': '**/*Ed*', 'known_tempo': 126, 'start': 60.34, 'len': 20 },
    # this song has more complicated rhythm, first case is 1.33x tempo, and sounds triplet feel.
    # second is closer to tempo, but still off sounding. 3rd, long 20s window, sounds great
    {'src': '**/*Dua*Rules*', 'known_tempo': 116, 'start': 60.80, 'len': 5 },
    {'src': '**/*Dua*Rules*', 'known_tempo': 116, 'start': 60.80, 'len': 10 },
    {'src': '**/*Dua*Rules*', 'known_tempo': 116, 'start': 80.80, 'len': 20 },
    #
    # simple metronomes
    {'src': '**/*126 BPM*', 'known_tempo': 126, 'start': 22.8, 'len': 5 },
    {'src': '**/*120 BPM*', 'known_tempo': 120, 'start': 10, 'len': 5 },
]
test_case = test_cases[TEST_CASE_INDEX]
test_case['end'] = test_case['start'] + test_case['len']  # absolute end time in the file (seconds)
# NOTE(review): glob's '**' only matches an arbitrary depth when
# recursive=True is passed; as written this matches exactly one directory
# level. Confirm that is the intended layout of the audio files.
src = glob.glob(test_case['src'])[0]
# Load only the selected window, resampled to 48 kHz (librosa loads mono by default).
y, sr = librosa.load(src, sr=48000, offset=test_case['start'], duration=test_case['len'])
ipd.display(pd.DataFrame([[sr, len(y), len(y.shape)]],
columns=["Sample rate Hz", "Num Samples", "Channels"]).style.hide())
ipd.Audio(y, rate=sr)
| Sample rate Hz | Num Samples | Channels |
|---|---|---|
| 48000 | 240000 | 1 |
Now use built in beat track method, and then generate a click track at those detected beats and overlay that click audio onto detected segment.
Librosa beat_track returns two outputs, the assumed detected tempo or BPM (beats per minute), as well as an array of detected beat events (in seconds time or sample time). As far as BPM specifically, we can compare the reported tempo to the tempo(s) inferred by the times (e.g. average) between these detected beat events.
Also note, depending on test audio input source, the number of detected beat events is often lower than expected (both if you listen for kick drum or clicks in the sample audio vs inferred number of beats from known tempo and snippet length).
HOP_LENGTH = 256  # analysis hop length (samples), shared by onset detection and beat tracking

def predict_beats(y, sr):
    """Detect beats in audio `y` at rate `sr`; return (tempo_bpm, beat_times_seconds)."""
    # Multi-channel onset strength; keep only the first (full-band) channel.
    strength = librosa.onset.onset_strength_multi(
        y=y, sr=sr,
        hop_length=HOP_LENGTH,
        aggregate=np.median,  # median is more outlier-robust than the default mean
        lag=1,                # default; "time lag for computing differences"
        max_size=1,           # default; do not filter frequency bins
        detrend=False,        # default; do not remove the DC component
        center=True,          # centered STFT frame analysis, by hop length
    )[..., 0, :]
    # tightness pulls beats toward the tempo estimate; empirically 800-1000
    # sounded good on these test cases, while 400/600/1600 did not.
    return librosa.beat.beat_track(
        onset_envelope=strength, sr=sr, units='time',
        hop_length=HOP_LENGTH,
        tightness=1000,
        # start_bpm=126,
        # trim=False,
    )
# Run detection on the loaded snippet and sanity-check against the known tempo.
reported_tempo, beats = predict_beats(y, sr)
# Number of beats we'd expect in the window at the known tempo.
expected_beats = math.floor(test_case['known_tempo'] * test_case['len'] / 60.0)
ipd.display(pd.DataFrame([
    ['Reported tempo', reported_tempo],
    ['Averaged tempo', 60 / np.average(np.diff(beats))],  # tempo implied by mean inter-beat interval
    ['Num beats detected vs expected', f"{len(beats)} vs {expected_beats}"],
]).style.hide(axis="columns").hide())
# Overlay a click at each detected beat so alignment can be judged by ear.
click_track = librosa.clicks(times=beats, sr=sr, length=len(y))
ipd.Audio(y + click_track, rate=sr)
| Reported tempo | 126.404494 |
| Averaged tempo | 126.720901 |
| Num beats detected vs expected | 10 vs 10 |
In the past, the reported tempo from Librosa was often not as good as the one indicated by the detected beats, not sure what may have changed regarding my inputs / knob fiddling, nor what the overall status is like now.
# Compare the reported tempo against tempos derived from the inter-beat intervals.
ibi = np.diff(beats)  # inter-beat intervals (seconds)
rows = [
    ['Reported', reported_tempo],
    ['Averaged', 60 / np.average(ibi)],
    ['Min', 60 / np.max(ibi)],     # slowest implied tempo (longest gap)
    ['Max', 60 / np.min(ibi)],     # fastest implied tempo (shortest gap)
    ['Median', 60 / np.median(ibi)],
    ['-', '-'],
    ['Known', test_case['known_tempo']],
    ['Known seconds per beat', 60 / test_case['known_tempo']],
    ['Averaged seconds per beat', np.average(ibi)],
]
pd.DataFrame(rows, columns=["Method", "BPM"]).style.hide()
| Method | BPM |
|---|---|
| Reported | 126.404494 |
| Averaged | 126.720901 |
| Min | 125.000000 |
| Max | 130.813953 |
| Median | 126.404494 |
| - | - |
| Known | 126 |
| Known seconds per beat | 0.476190 |
| Averaged seconds per beat | 0.473481 |
Now we get into using the detected past time window's beats and applying it to recent future audio.
Let's use that prediction and overlay the would be assumed beats onto the next chunk of the track and see how it sounds. This first method is to duplicate and add the beat events shifted over.
# First load twice as much audio so there is "future" signal to predict into.
duration = 2.0 * test_case['len']
future, _ = librosa.load(src, sr=sr, offset=test_case['start'], duration=duration)
# Extend the detected beat grid into the future by cycling through the
# observed inter-beat intervals until the grid covers the loaded duration.
diffs = np.diff(beats)  # observed inter-beat intervals (seconds)
extended = list(beats)
beats_added = 0
while extended[-1] < duration:
    extended.append(extended[-1] + diffs[beats_added % len(diffs)])
    beats_added += 1
future_beats = np.array(extended)
# Click at both the detected and the extrapolated beats, over the longer audio.
future_click = librosa.clicks(times=future_beats, sr=sr, length=len(future))
ipd.Audio(future + future_click, rate=sr)
22
Or choose just a single beat as an anchor (e.g. the first or last one) and use one of the reported/derived tempos as constant spacing to create an overlaid click track. Perhaps it's here that implementing decent fitting / selection of both tempo and anchor could yield better results overall...
# Anchor on the first detected beat and lay down clicks at the mean observed
# inter-beat interval — a constant-tempo click track.
spb = np.average(np.diff(beats))  # seconds per beat
constant_bpm_clicks = [beats[0]]
# Bug fix: beat times are relative to the start of the loaded snippet, so the
# grid must extend to `duration` (the length of `future`), not the absolute
# test_case['end'] (= start + len, i.e. >60 s here) — the old bound generated
# far more click times than the audio can contain.
while constant_bpm_clicks[-1] < duration:
    constant_bpm_clicks = constant_bpm_clicks + [constant_bpm_clicks[-1] + spb]
ipd.Audio(future + librosa.clicks(times=constant_bpm_clicks, sr=sr, length=len(future)), rate=sr)
# Use the reported tempo (alternatively, the averaged one) as a constant BPM.
# tempo_used = 60 / np.average(np.diff(beats))
tempo_used = reported_tempo
spb = 60 / tempo_used  # seconds per beat at the chosen tempo
# Estimate where "beat one" sits by shifting each detected beat back k periods;
# averaging those estimates fits the anchor to all detected beats, not just one.
beat1_ests = [beat - k * spb for k, beat in enumerate(beats)]
anchor = np.mean(beat1_ests)
# Constant-tempo grid from the anchor to the end of the loaded audio.
# Bug fix: the original rebuilt this grid with a while-loop bounded by the
# absolute test_case['end'] (beat times are snippet-relative, so that bound is
# wrong) and ignored the `beat_ests` grid it had just computed — use it.
beat_ests = np.arange(anchor, duration, spb)
constant_bpm_clicks = beat_ests
ipd.Audio(future + librosa.clicks(times=constant_bpm_clicks, sr=sr, length=len(future)), rate=sr)
These might be good enough?
With minimal testing (read: a few cases... 😬), either sounds OK?
Improve the above Librosa calls (try other params?), test with multiple sound files (especially ones with multiple songs started, stopped, transitioned to, etc.) and past time windows. (These now ancient notes indicate we can probably improve using the librosa output with even basic / brute-force overlay fitting of the detected beat tempo + sample times?)
Apply "smoothing". This is where statistics / math nerds might be able to quickly help? I imagine, if I get all the above working, the predictions will vary in accuracy, plus we must remember that anyone can stop a currently playing song and play another one of an entirely different tempo. It's luckily not the end of the world if the beat sync is wildly off (I hope), especially for a short time, but it would be nice if we could strike a balance that weights the last N calcs / M minutes of tempos while handling abrupt changes (stops, starts, new songs, etc.).
More on smoothing, the Dua Lipa track, it sounds like when I tried different window variance it may detect the same beats but like "dotted" / 1.5x type spacing, both BPM and beats. So I wonder if that's able to be factored into (probably more and more non-trivial) smoothing / combining techniques. And another smoothing related thought: could try multiple length windows and weight them somehow (e.g. 5s, 10s, 20s, 60s into the past)?
Add visual rendering of waveform, with past window's beat detections and future predictions marked.
Use a real-time audio stream. Obviously, this notebook uses pre-baked audio files for quick demo / testing. In this repo, I've started code that uses Python audio lib(s) to listen to a real-time audio device stream (like what would be played live, for beat detection and sync), store the samples in a ring buffer, and use the same librosa code to run short-windowed-into-the-past beat detection algorithms. Needs revisit + clean up. IN PROGRESS
Take that prediction output, and make sure it is sample-time synchronized / accurate with the real-time audio input samples and wall-clock time.
Fire OSC LX Studio compatible events synchronized to predictions.
Create LX Studio beat specific FX, namely to test + demo. (Can be done in parallel, before / while actual tempo improvements are being made). DONE
# Toy signal: runs of zeros (silence) separated by non-zero "beep" samples.
a = [1.3888, 2, 3.37, 0, 0, 0, 0, 0, 0, 4, 5, 6, 0, 0, 0, 0, 9, 8, 7, 0, 10, 11]
# One-liner equivalent of the zero-run finder (kept for reference):
# np.ediff1d(np.r_[0, a == 0, 0]).nonzero()[0].reshape(-1, 2)
def find_zero_runs(a):
    """Return an (n, 2) array of [start, end) index pairs for each run of zeros in `a`."""
    # Boolean mask of zeros, padded with False on both ends so runs touching
    # the array edges still produce a start and an end transition.
    padded = np.concatenate(([False], np.equal(a, 0), [False]))
    # A run boundary is wherever consecutive mask values differ; boundaries
    # come in (start, end) pairs, so they reshape cleanly into rows.
    edges = np.flatnonzero(padded[1:] != padded[:-1])
    return edges.reshape(-1, 2)
# Convert the runs to plain lists, show them all, then keep only runs
# longer than 2 samples.
zr = [list(run) for run in find_zero_runs(a)]
ipd.display(zr)
zr = [run for run in zr if run[1] - run[0] > 2]
zr
[[3, 9], [12, 16], [19, 20]]
[[3, 9], [12, 16]]
# Sign-change demo: np.diff(np.sign(x)) is non-zero exactly where the sample
# sign changes (+-2 for a positive<->negative flip, +-1 entering/leaving zero).
a = [1.3888, 2, 3.37, -1, -2, 0, 0, 0, 0, 0, 0, 4, 5, 6, 0, 0, 0, 0, 9, 8, 7, 0, 10, 11]
a = np.diff(np.sign(a))
a
array([ 0., 0., -2., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0.,
-1., 0., 0., 0., 1., 0., 0., -1., 1., 0.])
# Load a pre-recorded beep/metronome capture for the zero-run BPM experiment.
# NOTE(review): per the traceback below, '/tmp/sigh.wav' did not exist when
# this ran (FileNotFoundError), so the following cells that use `y` never
# executed successfully in this session.
y, sr = librosa.load('/tmp/sigh.wav', sr=48000)
y
--------------------------------------------------------------------------- LibsndfileError Traceback (most recent call last) File /opt/conda/lib/python3.10/site-packages/librosa/core/audio.py:176, in load(path, sr, mono, offset, duration, dtype, res_type) 175 try: --> 176 y, sr_native = __soundfile_load(path, offset, duration, dtype) 178 except sf.SoundFileRuntimeError as exc: 179 # If soundfile failed, try audioread instead File /opt/conda/lib/python3.10/site-packages/librosa/core/audio.py:209, in __soundfile_load(path, offset, duration, dtype) 207 else: 208 # Otherwise, create the soundfile object --> 209 context = sf.SoundFile(path) 211 with context as sf_desc: File /opt/conda/lib/python3.10/site-packages/soundfile.py:658, in SoundFile.__init__(self, file, mode, samplerate, channels, subtype, endian, format, closefd) 656 self._info = _create_info_struct(file, mode, samplerate, channels, 657 format, subtype, endian) --> 658 self._file = self._open(file, mode_int, closefd) 659 if set(mode).issuperset('r+') and self.seekable(): 660 # Move write position to 0 (like in Python file objects) File /opt/conda/lib/python3.10/site-packages/soundfile.py:1216, in SoundFile._open(self, file, mode_int, closefd) 1215 err = _snd.sf_error(file_ptr) -> 1216 raise LibsndfileError(err, prefix="Error opening {0!r}: ".format(self.name)) 1217 if mode_int == _snd.SFM_WRITE: 1218 # Due to a bug in libsndfile version <= 1.0.25, frames != 0 1219 # when opening a named pipe in SFM_WRITE mode. 1220 # See http://github.com/erikd/libsndfile/issues/77. LibsndfileError: Error opening '/tmp/sigh.wav': System error. 
During handling of the above exception, another exception occurred: FileNotFoundError Traceback (most recent call last) Cell In[9], line 1 ----> 1 y, sr = librosa.load('/tmp/sigh.wav', sr=48000) 2 y File /opt/conda/lib/python3.10/site-packages/librosa/core/audio.py:184, in load(path, sr, mono, offset, duration, dtype, res_type) 180 if isinstance(path, (str, pathlib.PurePath)): 181 warnings.warn( 182 "PySoundFile failed. Trying audioread instead.", stacklevel=2 183 ) --> 184 y, sr_native = __audioread_load(path, offset, duration, dtype) 185 else: 186 raise exc File /opt/conda/lib/python3.10/site-packages/decorator.py:232, in decorate.<locals>.fun(*args, **kw) 230 if not kwsyntax: 231 args, kw = fix(args, kw, sig) --> 232 return caller(func, *(extras + args), **kw) File /opt/conda/lib/python3.10/site-packages/librosa/util/decorators.py:60, in deprecated.<locals>.__wrapper(func, *args, **kwargs) 51 """Warn the user, and then proceed.""" 52 warnings.warn( 53 "{:s}.{:s}\n\tDeprecated as of librosa version {:s}." 54 "\n\tIt will be removed in librosa version {:s}.".format( (...) 
58 stacklevel=3, # Would be 2, but the decorator adds a level 59 ) ---> 60 return func(*args, **kwargs) File /opt/conda/lib/python3.10/site-packages/librosa/core/audio.py:241, in __audioread_load(path, offset, duration, dtype) 238 reader = path 239 else: 240 # If the input was not an audioread object, try to open it --> 241 reader = audioread.audio_open(path) 243 with reader as input_file: 244 sr_native = input_file.samplerate File /opt/conda/lib/python3.10/site-packages/audioread/__init__.py:127, in audio_open(path, backends) 125 for BackendClass in backends: 126 try: --> 127 return BackendClass(path) 128 except DecodeError: 129 pass File /opt/conda/lib/python3.10/site-packages/audioread/rawread.py:59, in RawAudioFile.__init__(self, filename) 58 def __init__(self, filename): ---> 59 self._fh = open(filename, 'rb') 61 try: 62 self._file = aifc.open(self._fh) FileNotFoundError: [Errno 2] No such file or directory: '/tmp/sigh.wav'
sr
# Find runs of zero samples (silence between beeps) in the loaded audio.
zr = find_zero_runs(y)
zr
zero_runs = list(map(list, zr))
if len(zero_runs) >= 4:
    # Time between the ends of two consecutive silence runs is presumably one
    # beep period, and the gap between runs the beep itself — verify on real
    # input once /tmp/sigh.wav exists.
    bpm = 60 / ((zero_runs[2][1] - zero_runs[1][1]) / sr)
    beep_len = ((zero_runs[2][0] - zero_runs[1][1] + 1) / sr)
# NOTE(review): bpm/beep_len are referenced outside the guard above — these
# raise NameError when fewer than 4 runs are found.
bpm
beep_len